1 Data Generation - The pipeline

1.1 Preprocessing

  1. Read April 2013 data and March 17 data.
  2. Do some basic preprocessing
  • Assign ids
  • Calculate age from birth year
  • Add scaled frequencies
## # A tibble: 16,271 x 71
##    county_id county_desc voter_reg_num status_cd voter_status_desc
##        <int> <chr>       <chr>         <chr>     <chr>            
##  1       100 YANCEY      000000014683  A         ACTIVE           
##  2       100 YANCEY      000000014722  A         ACTIVE           
##  3       100 YANCEY      000000010135  A         ACTIVE           
##  4       100 YANCEY      000000022654  A         ACTIVE           
##  5       100 YANCEY      000000020123  R         REMOVED          
##  6       100 YANCEY      000000017105  R         REMOVED          
##  7       100 YANCEY      000000014537  A         ACTIVE           
##  8       100 YANCEY      000000019234  A         ACTIVE           
##  9       100 YANCEY      000000002043  A         ACTIVE           
## 10       100 YANCEY      000000019138  A         ACTIVE           
## # ... with 16,261 more rows, and 66 more variables: reason_cd <chr>,
## #   voter_status_reason_desc <chr>, absent_ind <chr>, name_prefx_cd <chr>,
## #   last_name <chr>, first_name <chr>, midl_name <chr>, name_suffix <chr>,
## #   res_street_address <chr>, res_city_desc <chr>, state_cd <chr>,
## #   zip_code <int>, mail_addr1 <chr>, mail_addr2 <chr>, mail_addr3 <chr>,
## #   mail_addr4 <chr>, mail_city <chr>, mail_state <chr>,
## #   mail_zipcode <int>, full_phone_number <chr>, race_code <chr>,
## #   ethnic_code <chr>, party_cd <chr>, gender_code <chr>, birth_age <int>,
## #   birth_place <chr>, registr_dt <chr>, precinct_abbrv <chr>,
## #   precinct_desc <chr>, municipality_abbrv <chr>,
## #   municipality_desc <chr>, ward_abbrv <chr>, ward_desc <chr>,
## #   cong_dist_abbrv <chr>, super_court_abbrv <chr>,
## #   judic_dist_abbrv <chr>, nc_senate_abbrv <chr>, nc_house_abbrv <chr>,
## #   county_commiss_abbrv <chr>, county_commiss_desc <chr>,
## #   township_abbrv <chr>, township_desc <chr>, school_dist_abbrv <chr>,
## #   school_dist_desc <chr>, fire_dist_abbrv <chr>, fire_dist_desc <chr>,
## #   water_dist_abbrv <chr>, water_dist_desc <chr>, sewer_dist_abbrv <chr>,
## #   sewer_dist_desc <chr>, sanit_dist_abbrv <chr>, sanit_dist_desc <chr>,
## #   rescue_dist_abbrv <chr>, rescue_dist_desc <chr>,
## #   munic_dist_abbrv <chr>, munic_dist_desc <chr>, dist_1_abbrv <chr>,
## #   dist_1_desc <chr>, dist_2_abbrv <chr>, dist_2_desc <chr>,
## #   confidential_ind <chr>, age <chr>, ncid <chr>, vtd_abbrv <chr>,
## #   vtd_desc <chr>, id_a <int>
## # A tibble: 16,276 x 72
##    county_id county_desc voter_reg_num status_cd voter_status_desc
##        <int> <chr>       <chr>         <chr>     <chr>            
##  1       100 YANCEY      000000026581  A         ACTIVE           
##  2       100 YANCEY      000000014683  A         ACTIVE           
##  3       100 YANCEY      000000014722  A         ACTIVE           
##  4       100 YANCEY      000000010135  A         ACTIVE           
##  5       100 YANCEY      000000025582  A         ACTIVE           
##  6       100 YANCEY      000000022654  I         INACTIVE         
##  7       100 YANCEY      000000020123  R         REMOVED          
##  8       100 YANCEY      000000026587  A         ACTIVE           
##  9       100 YANCEY      000000017105  R         REMOVED          
## 10       100 YANCEY      000000026924  A         ACTIVE           
## # ... with 16,266 more rows, and 67 more variables: reason_cd <chr>,
## #   voter_status_reason_desc <chr>, absent_ind <chr>, name_prefx_cd <chr>,
## #   last_name <chr>, first_name <chr>, middle_name <chr>,
## #   name_suffix <chr>, res_street_address <chr>, res_city_desc <chr>,
## #   state_cd <chr>, zip_code <int>, mail_addr1 <chr>, mail_addr2 <chr>,
## #   mail_addr3 <chr>, mail_addr4 <chr>, mail_city <chr>, mail_state <chr>,
## #   mail_zipcode <chr>, full_phone_number <dbl>, race_code <chr>,
## #   ethnic_code <chr>, party_cd <chr>, gender_code <chr>, birth_age <int>,
## #   birth_state <chr>, drivers_lic <chr>, registr_dt <chr>,
## #   precinct_abbrv <chr>, precinct_desc <chr>, municipality_abbrv <chr>,
## #   municipality_desc <chr>, ward_abbrv <chr>, ward_desc <chr>,
## #   cong_dist_abbrv <int>, super_court_abbrv <int>,
## #   judic_dist_abbrv <int>, nc_senate_abbrv <int>, nc_house_abbrv <int>,
## #   county_commiss_abbrv <chr>, county_commiss_desc <chr>,
## #   township_abbrv <chr>, township_desc <chr>, school_dist_abbrv <chr>,
## #   school_dist_desc <chr>, fire_dist_abbrv <chr>, fire_dist_desc <chr>,
## #   water_dist_abbrv <chr>, water_dist_desc <chr>, sewer_dist_abbrv <chr>,
## #   sewer_dist_desc <chr>, sanit_dist_abbrv <chr>, sanit_dist_desc <chr>,
## #   rescue_dist_abbrv <chr>, rescue_dist_desc <chr>,
## #   munic_dist_abbrv <chr>, munic_dist_desc <chr>, dist_1_abbrv <int>,
## #   dist_1_desc <chr>, dist_2_abbrv <chr>, dist_2_desc <chr>,
## #   confidential_ind <chr>, age <chr>, ncid <chr>, vtd_abbrv <chr>,
## #   vtd_desc <chr>, id_b <int>
## # A tibble: 16,271 x 10
##     id_a fname   lname     birth_age gender_code race_code voter_reg_num
##    <int> <chr>   <chr>         <int> <chr>       <chr>     <chr>        
##  1     1 KAREN   ABERNATHY        62 F           W         000000014683 
##  2     2 TRAVIS  ABERNATHY        70 M           W         000000014722 
##  3     3 RONALD  ABRAHAM          63 M           W         000000010135 
##  4     4 DARCI   ABRAMS           33 F           W         000000022654 
##  5     5 LYDIA   ABRAMS           44 F           W         000000020123 
##  6     6 JOSHUA  ACUFF            27 M           W         000000017105 
##  7     7 BARBARA ADAMS            65 F           W         000000014537 
##  8     8 CAROLYN ADAMS            63 F           W         000000019234 
##  9     9 DAVID   ADAMS            57 M           W         000000002043 
## 10    10 JESSICA ADAMS            34 F           W         000000019138 
## # ... with 16,261 more rows, and 3 more variables: birth_year <dbl>,
## #   ffreq <dbl>, lfreq <dbl>
## # A tibble: 16,276 x 10
##     id_b fname   lname     birth_age gender_code race_code voter_reg_num
##    <int> <chr>   <chr>         <int> <chr>       <chr>     <chr>        
##  1     1 PAMELA  ABARE            49 F           W         000000026581 
##  2     2 KAREN   ABERNATHY        66 F           W         000000014683 
##  3     3 TRAVIS  ABERNATHY        73 M           W         000000014722 
##  4     4 RONALD  ABRAHAM          67 M           W         000000010135 
##  5     5 BRIAN   ABRAM            36 M           W         000000025582 
##  6     6 DARCI   ABRAMS           36 F           W         000000022654 
##  7     7 LYDIA   ABRAMS           48 F           W         000000020123 
##  8     8 MALINDA ACHARD           43 F           W         000000026587 
##  9     9 JOSHUA  ACUFF            31 M           W         000000017105 
## 10    10 BARBARA ADAMS            78 F           W         000000026924 
## # ... with 16,266 more rows, and 3 more variables: birth_year <dbl>,
## #   ffreq <dbl>, lfreq <dbl>

1.2 Match generation

  1. Extract exact matches
  2. Extract voter id matches
  3. Filter out exact matches from voter id matches -> matches
## # A tibble: 12,502 x 2
##     id_a  id_b
##    <int> <int>
##  1     1     2
##  2     3     4
##  3     5     7
##  4     6     9
##  5     7    11
##  6     8    12
##  7    14    17
##  8    15    19
##  9    17    20
## 10    18    21
## # ... with 12,492 more rows
## # A tibble: 13,495 x 3
##     id_a  id_b match
##    <int> <int> <chr>
##  1     1     2 match
##  2     2     3 match
##  3     3     4 match
##  4     4     6 match
##  5     5     7 match
##  6     6     9 match
##  7     7    11 match
##  8     8    12 match
##  9    10 11572 match
## 10    12    15 match
## # ... with 13,485 more rows
## # A tibble: 1,271 x 22
##     id_a  id_b match fname_a lname_a birth_age_a gender_code_a race_code_a
##    <int> <int> <chr> <chr>   <chr>         <int> <chr>         <chr>      
##  1     2     3 match TRAVIS  ABERNA…          70 M             W          
##  2     4     6 match DARCI   ABRAMS           33 F             W          
##  3    10 11572 match JESSICA ADAMS            34 F             W          
##  4    12    15 match KELSEY  ADAMS            22 F             W          
##  5    20    22 match CONNIE  ADKINS           52 F             W          
##  6    27  6956 match FELICIA ADKINS           51 F             W          
##  7    44 12560 match STACY   ADKINS           35 F             W          
##  8    51    48 match BETHANY ADKINS…          30 F             W          
##  9    62    57 match KELLY   AKINS            39 F             W          
## 10    85    82 match AMBER   ALLEN            37 F             W          
## # ... with 1,261 more rows, and 14 more variables: voter_reg_num_a <chr>,
## #   birth_year_a <dbl>, ffreq_a <dbl>, lfreq_a <dbl>, fname_b <chr>,
## #   lname_b <chr>, birth_age_b <int>, gender_code_b <chr>,
## #   race_code_b <chr>, voter_reg_num_b <chr>, birth_year_b <dbl>,
## #   ffreq_b <dbl>, lfreq_b <dbl>, pair_id <int>

1.3 Unmatch generation

df_match_block <- 
  df_matches_unexact %>% 
  select(matches(or("id", "fname", "lname"))) %>% 
  mutate(fname_soundex_a = soundex(fname_a),
         fname_soundex_b = soundex(fname_b),
         fname_dm_a = map(fname_a, DoubleMetaphone),
         fname_dm1_a = map_chr(fname_dm_a, 1),
         fname_dm2_a = map_chr(fname_dm_a, 2),
         fname_dm_b = map(fname_b, DoubleMetaphone),
         fname_dm1_b = map_chr(fname_dm_b, 1),
         fname_dm2_b = map_chr(fname_dm_b, 2),
         lname_soundex_a = soundex(lname_a),
         lname_soundex_b = soundex(lname_b),
         lname_dm_a = map(lname_a, DoubleMetaphone),
         lname_dm1_a = map_chr(lname_dm_a, 1),
         lname_dm2_a = map_chr(lname_dm_a, 2),
         lname_dm_b = map(lname_b, DoubleMetaphone),
         lname_dm1_b = map_chr(lname_dm_b, 1),
         lname_dm2_b = map_chr(lname_dm_b, 2)) %>% 
  select(-fname_dm_a, -fname_dm_b, -lname_dm_a, -lname_dm_b)

df_a_block <- 
  df_a_mod %>% 
  select(id_a, fname, lname) %>% 
  rename(fname_a = fname,
         lname_a = lname) %>% 
  mutate(fname_soundex_a = soundex(fname_a),
         fname_dm_a = map(fname_a, DoubleMetaphone),
         fname_dm1_a = map_chr(fname_dm_a, 1),
         fname_dm2_a = map_chr(fname_dm_a, 2),
         lname_soundex_a = soundex(lname_a),
         lname_dm_a = map(lname_a, DoubleMetaphone),
         lname_dm1_a = map_chr(lname_dm_a, 1),
         lname_dm2_a = map_chr(lname_dm_a, 2)) %>% 
  select(-fname_dm_a, -lname_dm_a)
  

df_b_block <- 
  df_b_mod %>% 
  select(id_b, fname, lname) %>% 
  rename(fname_b = fname,
         lname_b = lname) %>% 
  mutate(fname_soundex_b = soundex(fname_b),
         fname_dm_b = map(fname_b, DoubleMetaphone),
         fname_dm1_b = map_chr(fname_dm_b, 1),
         fname_dm2_b = map_chr(fname_dm_b, 2),
         lname_soundex_b = soundex(lname_b),
         lname_dm_b = map(lname_b, DoubleMetaphone),
         lname_dm1_b = map_chr(lname_dm_b, 1),
         lname_dm2_b = map_chr(lname_dm_b, 2)) %>% 
  select(-fname_dm_b, -lname_dm_b)

df_match_block_a <- 
  df_match_block %>% 
  select(contains("_a")) 

df_match_block_b <- 
  df_match_block %>% 
  select(contains("_b")) 

df_all_combos <- 
  bind_rows(
      df_match_block_a %>% 
      inner_join(df_b_block, by = c("fname_soundex_a" = "fname_soundex_b")) %>% 
      select(id_a, id_b),
      
    df_match_block_a %>% 
      inner_join(df_b_block, by = c("fname_dm1_a" = "fname_dm1_b")) %>% 
      select(id_a, id_b),
    
    df_match_block_a %>% 
      inner_join(df_b_block, by = c("fname_dm2_a" = "fname_dm2_b")) %>% 
      select(id_a, id_b),
    
    df_match_block_a %>% 
      inner_join(df_b_block, by = c("lname_soundex_a" = "lname_soundex_b")) %>% 
      select(id_a, id_b),
    
    df_match_block_a %>% 
      inner_join(df_b_block, by = c("lname_dm1_a" = "lname_dm1_b")) %>% 
      select(id_a, id_b),
    
    df_match_block_a %>% 
      inner_join(df_b_block, by = c("lname_dm2_a" = "lname_dm2_b")) %>% 
      select(id_a, id_b),
    
    df_match_block_b %>% 
      inner_join(df_a_block, by = c("fname_soundex_b" = "fname_soundex_a")) %>% 
      select(id_a, id_b),
    
    df_match_block_b %>% 
      inner_join(df_a_block, by = c("fname_dm1_b" = "fname_dm1_a")) %>% 
      select(id_a, id_b),
    
    df_match_block_b %>% 
      inner_join(df_a_block, by = c("fname_dm2_b" = "fname_dm2_a")) %>% 
      select(id_a, id_b),
    
    df_match_block_b %>% 
      inner_join(df_a_block, by = c("lname_soundex_b" = "lname_soundex_a")) %>% 
      select(id_a, id_b),
    
    df_match_block_b %>% 
      inner_join(df_a_block, by = c("lname_dm1_b" = "lname_dm1_a")) %>% 
      select(id_a, id_b),
    
    df_match_block_b %>% 
      inner_join(df_a_block, by = c("lname_dm2_b" = "lname_dm2_a")) %>% 
      select(id_a, id_b)
  ) %>% 
  distinct()


df_all_combos_nested <- 
  df_all_combos %>% 
  left_join(df_a_mod, by = "id_a") %>% 
  left_join(df_b_mod, by = "id_b", suffix = c("_a", "_b")) %>% 
  group_by(id_a, id_b) %>% 
  nest()

n_unique <- function(x) x %>% unique() %>% length()
rename_weight <- function(x) str_c(x, "_weight")
df_to_vector <- function(df) df %>% .[1, ] %>% unclass() %>% as.double()

weight_vector <- 
  df_a_mod %>% 
  bind_rows(df_b_mod) %>% 
  select(fname, gender_code, race_code, birth_year) %>% 
  summarise_all(n_unique) %>% 
  rename_all(rename_weight) %>% 
  mutate(sum = 
           fname_weight + 
           # lname_weight + 
           gender_code_weight +
           race_code_weight +
           birth_year_weight) %>% 
  mutate_all(function(x, all) x/all, all = .$sum) %>% 
  select(-sum) %>% 
  df_to_vector()


df_all_combos_nested_sim <- 
  df_all_combos_nested %>% 
  mutate(x = map_dbl(data, calculate_hamming_fields))

df_unmatches_unexact <- 
  df_all_combos_nested_sim %>% 
  semi_join(df_matches_unexact, by = "id_a") %>% 
  group_by(id_a) %>% 
  arrange(desc(x), .by_group = T) %>% 
  slice(1:5) %>% 
  ungroup() %>% 
  select(starts_with("id")) %>% 
  left_join(df_a_mod, by = "id_a") %>%
  left_join(df_b_mod, by = "id_b", suffix = c("_a", "_b")) %>%
  mutate(pair_id = 986398  + row_number()) %>%  
  mutate(match = "unmatch")

(df_unmatches_unexact %>% 
  write_rds(data_path_file("df_unmatches_unexact.rds")))
## # A tibble: 6,347 x 22
##     id_a  id_b fname_a lname_a   birth_age_a gender_code_a race_code_a
##    <int> <int> <chr>   <chr>           <int> <chr>         <chr>      
##  1     2     3 TRAVIS  ABERNATHY          70 M             W          
##  2     2   626 TRAVIS  ABERNATHY          70 M             W          
##  3     2   799 TRAVIS  ABERNATHY          70 M             W          
##  4     2  1982 TRAVIS  ABERNATHY          70 M             W          
##  5     2  2113 TRAVIS  ABERNATHY          70 M             W          
##  6     4     6 DARCI   ABRAMS             33 F             W          
##  7     4  9757 DARCI   ABRAMS             33 F             W          
##  8     4  1080 DARCI   ABRAMS             33 F             W          
##  9     4  2265 DARCI   ABRAMS             33 F             W          
## 10     4  2398 DARCI   ABRAMS             33 F             W          
## # ... with 6,337 more rows, and 15 more variables: voter_reg_num_a <chr>,
## #   birth_year_a <dbl>, ffreq_a <dbl>, lfreq_a <dbl>, fname_b <chr>,
## #   lname_b <chr>, birth_age_b <int>, gender_code_b <chr>,
## #   race_code_b <chr>, voter_reg_num_b <chr>, birth_year_b <dbl>,
## #   ffreq_b <dbl>, lfreq_b <dbl>, pair_id <dbl>, match <chr>

1.4 Pair Generation

## # A tibble: 7,618 x 22
##     id_a  id_b fname_a      lname_a  birth_age_a gender_code_a race_code_a
##    <int> <int> <chr>        <chr>          <int> <chr>         <chr>      
##  1  4730  4650 DARRELL      FORBES            49 M             W          
##  2  7031  4254 GARRY        HOPSON            43 M             W          
##  3 10995  4867 CARMON       PETERSON          57 M             W          
##  4  7090  7129 LISA         HOUSLEY           38 F             W          
##  5  3576  3621 NORWOOD      CUMMINS…          72 M             W          
##  6  6037  6093 CHARLES      HARRIS            84 M             W          
##  7 10527 10552 AMANDA       NORTON            28 F             W          
##  8 12752   153 MARGARET - … RUTKOWS…          31 F             W          
##  9 12166  7935 MARTHA       RIDDLE            74 F             W          
## 10  1151   283 AMANDA       BENNETT           66 F             W          
## # ... with 7,608 more rows, and 15 more variables: voter_reg_num_a <chr>,
## #   birth_year_a <dbl>, ffreq_a <dbl>, lfreq_a <dbl>, fname_b <chr>,
## #   lname_b <chr>, birth_age_b <int>, gender_code_b <chr>,
## #   race_code_b <chr>, voter_reg_num_b <chr>, birth_year_b <dbl>,
## #   ffreq_b <dbl>, lfreq_b <dbl>, pair_id <dbl>, match <chr>
## # A tibble: 15,236 x 12
##    pair_id fname   lname     gender_code race_code birth_year match    id
##      <dbl> <chr>   <chr>     <chr>       <chr>          <dbl> <chr> <int>
##  1       1 TRAVIS  ABERNATHY M           W               1943 match     2
##  2       1 TRAVIS  ABERNATHY M           W               1944 match     3
##  3       2 DARCI   ABRAMS    F           W               1980 match     4
##  4       2 DARCI   ABRAMS    F           W               1981 match     6
##  5       3 JESSICA ADAMS     F           W               1979 match    10
##  6       3 JESSICA RAHN      F           W               1979 match 11572
##  7       4 KELSEY  ADAMS     F           W               1991 match    12
##  8       4 KELSEY  ADAMS     F           W               1992 match    15
##  9       5 CONNIE  ADKINS    F           W               1961 match    20
## 10       5 CONNIE  ADKINS    F           W               1962 match    22
## # ... with 15,226 more rows, and 4 more variables: birthge_a <int>,
## #   voter_reg_num <chr>, ffreq <dbl>, lfreq <dbl>

2 Disturbing data

The above pipeline was wrapped into a function, so that database A can be made noisy and the pair generation can be repeated.

df_error_table <- 
  read_csv("R/paper/error_table.csv")

set.seed(13)
df_messed_collection <- 
  tibble(error_rate = seq(0, 40, 10)) %>% 
    mutate(df_a_mod = map(error_rate, function(e){
      # browser()
      df_a_mod %>%
        prep_data() %>%
        mess_data(df_error_table %>% 
                    mutate(amount = amount*e/10)) %>% 
        pluck("df_secondary") %>% 
        select(-file, -id) %>% 
        mutate_if(is.character, str_to_upper) %>% 
        mutate(birth_year = as.integer(birth_year)) %>% 
        select(-ffreq, -lfreq) %>% 
        add_count(fname) %>%
        rename(ffreq = n) %>% 
        add_count(lname) %>% 
        rename(lfreq = n) %>% 
        mutate(ffreq = scale(ffreq),
               lfreq = scale(lfreq))
    }))

df_messed_collection <- 
  df_messed_collection %>% 
  mutate(df_pairs = map2(df_a_mod, error_rate, function(df_a_mod, e) {
    message("________________________")
    message(glue("Error rate: {e}%"))
    
    generate_pairs(df_a_mod = df_a_mod, 
                   df_b_mod = df_b_mod, 
                   data_pref = 
                     e %>% 
                     str_pad(width = 2, side = "left", pad = "0") %>% 
                     str_c("err_", .))
  }))

df_messed_collection %>%
  write_rds("data/paper/error_data/df_messed_collection.rds")

df_messed_collection
## # A tibble: 5 x 3
##   error_rate df_a_mod               df_pairs              
##        <dbl> <list>                 <list>                
## 1        0   <tibble [16,271 × 10]> <tibble [7,618 × 22]> 
## 2        0.1 <tibble [16,271 × 10]> <tibble [14,391 × 22]>
## 3        0.2 <tibble [16,271 × 10]> <tibble [18,932 × 22]>
## 4        0.3 <tibble [16,271 × 10]> <tibble [23,588 × 22]>
## 5        0.4 <tibble [16,271 × 10]> <tibble [27,735 × 22]>

3 Feature Engineering

Using a function called add_feature_vector to generate features.

df_messed_collection <-
  df_messed_collection %>% 
  mutate(df_feature = map(df_pairs, function(df_pair){
    df_pair %>%
      add_feature_vector() %>%
      select(starts_with("metric"), match) %>%
      mutate(match = match %>% factor(levels = c("unmatch", "match"))) %>%
      as.data.frame()
  }))
  

df_messed_collection %>%
  write_rds("data/paper/error_data/df_feature_collection.rds")
  
df_messed_collection
## # A tibble: 5 x 4
##   error_rate df_a_mod               df_pairs               df_feature     
##        <dbl> <list>                 <list>                 <list>         
## 1        0   <tibble [16,271 × 10]> <tibble [7,618 × 22]>  <data.frame [7…
## 2        0.1 <tibble [16,271 × 10]> <tibble [14,391 × 22]> <data.frame [1…
## 3        0.2 <tibble [16,271 × 10]> <tibble [18,932 × 22]> <data.frame [1…
## 4        0.3 <tibble [16,271 × 10]> <tibble [23,588 × 22]> <data.frame [2…
## 5        0.4 <tibble [16,271 × 10]> <tibble [27,735 × 22]> <data.frame [2…

One df_feature dataframe looks like these.

## # A tibble: 7,618 x 16
##    metric_gender_co… metric_gender_co… metric_gender_cod… metric_race_cod…
##                <int>             <int> <lgl>              <lgl>           
##  1                 0                 2 TRUE               TRUE            
##  2                 0                 2 TRUE               TRUE            
##  3                 0                 2 TRUE               TRUE            
##  4                 2                 0 TRUE               TRUE            
##  5                 0                 2 TRUE               TRUE            
##  6                 0                 2 TRUE               TRUE            
##  7                 2                 0 TRUE               TRUE            
##  8                 2                 0 TRUE               TRUE            
##  9                 2                 0 TRUE               TRUE            
## 10                 2                 0 TRUE               TRUE            
## # ... with 7,608 more rows, and 12 more variables:
## #   metric_race_code_ww_bb <lgl>, metric_year_diff_abs <int>,
## #   metric_age <dbl>, metric_ffreq_mean <dbl>, metric_lfreq_mean <dbl>,
## #   metric_fname_dl <dbl>, metric_fname_jw <dbl>,
## #   metric_fname_soundex <dbl>, metric_lname_dl <dbl>,
## #   metric_lname_jw <dbl>, metric_lname_soundex <dbl>, match <fct>

4 Modeling

4.1 Modeling setup

  
df_messed_collection <- 
  df_messed_collection %>% 
  mutate(
    model_rf = map(df_train, function(df_tr){
      # browser()
      set.seed(3)
      train(match ~ .,
            df_tr,
            trControl = train_control,
            tuneGrid = expand.grid(.mtry = seq(1, 6, 1)),
            importance = TRUE,
            keep.forest= TRUE,
            ntree = 350,
            method = "rf")
    }),
    model_svm_radial = map(df_train, function(df_tr){
        # browser()
        set.seed(3)
        train(match ~ .,
              df_tr,
              trControl = train_control,
              method = "svmRadial")
      }),
    model_svm_linear = map(df_train, function(df_tr){
      # browser()
      set.seed(3)
      train(match ~ .,
            df_tr,
            trControl = train_control,
            method = "svmLinear") 
  })
  )



df_messed_collection %>% 
  write_rds("data/paper/error_data/df_messed_collection_models.rds")
  
df_messed_collection
## # A tibble: 5 x 11
##   error_rate df_a_mod   df_pairs   df_feature   train_indices test_indices
##        <dbl> <list>     <list>     <list>       <list>        <list>      
## 1        0   <tibble [… <tibble [… <data.frame… <int [5,000]> <int [2,618…
## 2        0.1 <tibble [… <tibble [… <data.frame… <int [5,000]> <int [9,391…
## 3        0.2 <tibble [… <tibble [… <data.frame… <int [5,000]> <int [13,93…
## 4        0.3 <tibble [… <tibble [… <data.frame… <int [5,000]> <int [18,58…
## 5        0.4 <tibble [… <tibble [… <data.frame… <int [5,000]> <int [22,73…
## # ... with 5 more variables: df_train <list>, df_test <list>,
## #   model_rf <list>, model_svm_radial <list>, model_svm_linear <list>